import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
import math

# Read the cleaned survey export into a DataFrame, decoding the file as Latin-1
df = pd.read_csv(
    filepath_or_buffer=r"D:\RENJIN RAJU\MASTERS\SEMESTER 3\THESIS\Research\Analysis\Survey data cleaned.csv",
    encoding='latin-1',
)

# Ensure the composite score columns exist.
# Assumption carried over from the original script: resilience_score is the mean of
# response_score and collaboration_score. Adjust here if the scoring logic differs.
response_cols = ['response_Increased_Inventory_Buffers', 'response_Diversified_Suppliers',
                 'response_Implemented_New_Technologies', 'response_Other']
collab_cols = ['resilience_collaboration_Cloud-based_platforms',
               'resilience_collaboration_Blockchain_for_transparency',
               'resilience_collaboration_AI_powered_predictive_analytics',
               'resilience_collaboration_IoT_for_real-time_tracking',
               'resilience_collaboration_Digital_twin_Simulations']

# When resilience_score is absent, both component scores are (re)computed from the raw
# columns, exactly as before. When it is already present, a component score is derived
# only if it is missing, because the analysis below reads 'response_score' and
# 'collaboration_score' unconditionally and would otherwise fail with a KeyError.
needs_resilience = 'resilience_score' not in df.columns

if needs_resilience or 'response_score' not in df.columns:
    # Row-wise total of the response_* columns
    df['response_score'] = df[response_cols].sum(axis=1)

if needs_resilience or 'collaboration_score' not in df.columns:
    # Row-wise total of the resilience_collaboration_* columns
    df['collaboration_score'] = df[collab_cols].sum(axis=1)

if needs_resilience:
    df['resilience_score'] = (df['response_score'] + df['collaboration_score']) / 2

# Columns whose distributions are plotted and tested for normality,
# listed group by group in the order they are reported
numerical_columns = (
    # Columns prefixed with "Disruption_"
    [
        'Disruption_Global_Crises',
        'Disruption_Natural_Disasters',
        'Disruption_Supplier_Failures',
        'Disruption_Transportation_Delays',
        'Disruption_Cybersecurity_Threats',
    ]
    # Columns prefixed with "adoption_"
    + [
        'adoption_Robotic_and_Automation',
        'adoption_Big_Data_Analytics',
        'adoption_Artificial_Intelligence_AI',
        'adoption_Internet_of_Things_IoT',
        'adoption_Simulation',
        'adoption_Other',
        'adoption_Blockchain',
        'adoption_Cloud_Computing',
    ]
    # Composite score columns
    + [
        'resilience_score',
        'response_score',
        'collaboration_score',
    ]
)


# Create plots to show the distribution of these variables
def plot_distributions(df, columns, *, n_cols=4):
    """Draw a histogram with a KDE overlay for each listed column in one figure.

    Args:
        df: DataFrame holding the data; must contain every name in ``columns``.
        columns: Column names to plot, one subplot each, in the given order.
        n_cols: Number of subplots per grid row. Defaults to 4, which
            reproduces the original 15-inch-wide layout.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.

    Missing values are dropped per column before plotting. When ``columns``
    is empty the function returns without creating a figure.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}")

    columns = list(columns)
    if not columns:
        # Nothing to draw; avoids opening a zero-height, empty figure.
        return

    num_rows = math.ceil(len(columns) / n_cols)  # Rows needed to fit every subplot
    # 3.75 in per column (15 in at the default 4 columns) and 4 in per row
    plt.figure(figsize=(3.75 * n_cols, num_rows * 4))

    for i, col in enumerate(columns):
        plt.subplot(num_rows, n_cols, i + 1)  # Subplot indices are 1-based
        sns.histplot(df[col].dropna(), kde=True)
        plt.title(f"Distribution of {col}")

    plt.tight_layout()
    plt.show()


# Perform Shapiro-Wilk test for normality on each numerical column
def check_normality(df, columns, alpha=0.05):
    """Run the Shapiro-Wilk normality test on each listed column.

    Args:
        df: DataFrame containing every name in ``columns``.
        columns: Column names to test; missing values are dropped per column.
        alpha: Significance level. A column is flagged as normal when its
            p-value is greater than this threshold (default 0.05).

    Returns:
        A dict mapping each column name to a dict with the keys
        ``'statistic'``, ``'p_value'`` and ``'normal'``. Columns with fewer
        than three non-missing values cannot be tested; they are reported
        with NaN for the statistic and p-value and ``normal=False`` instead
        of aborting the whole run.
    """
    min_observations = 3  # shapiro() needs at least three data points
    normality_results = {}
    for col in columns:
        values = df[col].dropna()  # Drop missing values
        if len(values) < min_observations:
            normality_results[col] = {
                'statistic': float('nan'),
                'p_value': float('nan'),
                'normal': False,
            }
            continue

        stat, p_value = shapiro(values)
        normality_results[col] = {
            'statistic': float(stat),
            'p_value': float(p_value),
            'normal': bool(p_value > alpha),
        }
    return normality_results


# Render the histogram grid for every column of interest
plot_distributions(df, numerical_columns)

# Run the Shapiro-Wilk check on the same columns
normality_results = check_normality(df, numerical_columns)

# Report one line per column: test statistic, p-value and the normality verdict
for col in normality_results:
    result = normality_results[col]
    print(
        f"{col}: Statistic = {result['statistic']:.3f}, p-value = {result['p_value']:.4f}, Normal? {'Yes' if result['normal'] else 'No'}")
